{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Final results for Enron-email dataset\n",
    "In this notebook, we run the VI algorithm to generate the results for Enron email dataset in the paper."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1: Import libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import time\n",
    "import numpy as np\n",
    "from datetime import datetime\n",
    "import torch\n",
    "from varhawkes.utils import metrics\n",
    "from tick.hawkes.inference import HawkesADM4\n",
    "from multiprocessing import Process\n",
    "import multiprocessing\n",
    "import bisect"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch import optim\n",
    "from varhawkes import models\n",
    "from varhawkes import posteriors\n",
    "from varhawkes import priors\n",
    "from varhawkes import hawkes_model, excitation_kernels\n",
    "from varhawkes.learners import VariationalInferenceLearner\n",
    "from experiment.util import LearnerCallback"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2: Read data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(143, 3)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>timestamps</th>\n",
       "      <th>num_events_m</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>allen-p</td>\n",
       "      <td>[816180.0, 818700.0, 818940.0, 1078200.0, 1082...</td>\n",
       "      <td>501</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>arnold-j</td>\n",
       "      <td>[5336760.0, 5337000.0, 6535320.0, 7460940.0, 7...</td>\n",
       "      <td>1112</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>arora-h</td>\n",
       "      <td>[932280.0, 30199440.0, 32030640.0, 33507360.0,...</td>\n",
       "      <td>66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>badeer-r</td>\n",
       "      <td>[14145420.0, 14202360.0, 14310660.0, 14382600....</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>bailey-s</td>\n",
       "      <td>[5141940.0, 5660700.0, 5661660.0, 8087640.0, 8...</td>\n",
       "      <td>187</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      index                                         timestamps  num_events_m\n",
       "0   allen-p  [816180.0, 818700.0, 818940.0, 1078200.0, 1082...           501\n",
       "1  arnold-j  [5336760.0, 5337000.0, 6535320.0, 7460940.0, 7...          1112\n",
       "2   arora-h  [932280.0, 30199440.0, 32030640.0, 33507360.0,...            66\n",
       "3  badeer-r  [14145420.0, 14202360.0, 14310660.0, 14382600....            50\n",
       "4  bailey-s  [5141940.0, 5660700.0, 5661660.0, 8087640.0, 8...           187"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_ts = pd.read_json('enron_dataset_timeseries.json')\n",
    "df_ts['timestamps'] = df_ts['timestamps'].apply(np.array)\n",
    "df_ts['timestamps'] = df_ts['timestamps'].apply(np.unique)\n",
    "\n",
    "# Filter out events before Jan 1, 2000\n",
    "min_timestamp = time.mktime(datetime(2000,1,1).timetuple())\n",
    "df_ts['timestamps'] = df_ts['timestamps'].apply(lambda events_m: events_m[events_m > min_timestamp] - min_timestamp)\n",
    "\n",
    "# Remove the nodes with too few observations (less than 10 emails)\n",
    "df_ts['num_events_m'] = df_ts['timestamps'].apply(len)\n",
    "df_ts = df_ts[df_ts.num_events_m > 10] \n",
    "df_ts = df_ts.reset_index()\n",
    "\n",
    "print(df_ts.shape)\n",
    "df_ts.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Scale the time to day-scale"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "end_time: 906.9444212962962\n"
     ]
    }
   ],
   "source": [
    "events = list(map(np.array, df_ts.timestamps.values))\n",
    "\n",
    "scale = 1.0 / (3600 * 24)\n",
    "end_time = max(map(max, events)) * scale\n",
    "\n",
    "print('end_time:', end_time)\n",
    "for m, events_m in enumerate(events):\n",
    "    events_m = np.unique(np.sort(events_m))\n",
    "    events_m = events_m * scale\n",
    "    events[m] = events_m\n",
    "start_time = min(map(min, events))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Split the data to test & train with proportion 70-30 %"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_thresh = (end_time-start_time) * 0.7 + start_time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_events(events, Thr=train_thresh):\n",
    "    events_train, events_test = list(), list()\n",
    "    for i, events_i in enumerate(events):\n",
    "        id_ = bisect.bisect_right(events_i, train_thresh)\n",
    "        events_train.append(events_i[:id_])\n",
    "        events_test.append(events_i[id_:]-train_thresh)\n",
    "    return events_train, events_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of train datapoints: 53392\n",
      "Number of test datapoints: 20902\n"
     ]
    }
   ],
   "source": [
    "events_train, events_test = get_events(events, Thr=train_thresh)\n",
    "end_time_train = max([max(nums) for nums in events_train if len(nums)>0])\n",
    "end_time_test = max([max(nums) for nums in events_test if len(nums)>0])\n",
    "num_test = sum(map(len,events_test))\n",
    "print(f'Number of train datapoints: {sum(map(len,events_train))}')\n",
    "print(f'Number of test datapoints: {num_test}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "\n",
    "## Inference with VI-SG"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set the hyper-parameters of the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "M, T = 10, 5\n",
    "torch.manual_seed(123456789)\n",
    "np.random.seed(123465789)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Transofrm the data to torch tensor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "events_train, events_test = get_events(events, Thr=train_thresh)\n",
    "end_time_train = max([max(nums) for nums in events_train if len(nums)>0])\n",
    "end_time_test = max([max(nums) for nums in events_test if len(nums)>0])\n",
    "events_test = [torch.tensor(events_m, dtype=torch.float64) for events_m in events_test]\n",
    "events_train = [torch.tensor(events_m, dtype=torch.float64) for events_m in events_train]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Train the VI-SG model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initialize cache 20449/20449     \n",
      "iter: 15000 | f1-score: 0.07 | relerr: 0.074 | p@20-50-100: 0.00 0.02 0.01 | dx: 9.39e-02     "
     ]
    },
    {
     "data": {
      "text/plain": [
       "tensor([-12.4626, -12.6971, -11.3088,  ...,   0.2667,  -0.1214,   0.1084],\n",
       "       dtype=torch.float64, requires_grad=True)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hawkes_model_obj = hawkes_model.HawkesModel(excitation_obj, verbose=True)\n",
    "posterior_obj = posteriors.LogNormalPosterior()\n",
    "n_params = n_nodes * (n_nodes * M + 1)\n",
    "C = 1.0 * torch.ones(n_params, dtype=torch.float64)\n",
    "prior_obj = priors.GaussianLaplacianPrior(n_nodes, n_params, C)\n",
    "model = models.ModelHawkesVariational(\n",
    "    model=hawkes_model_obj, posterior=posterior_obj, prior=prior_obj, n_samples=1, n_weights=1, weight_temp=1e0,\n",
    ")\n",
    "x = torch.tensor(\n",
    "    np.hstack((\n",
    "        np.hstack((  # alpha, the mean of the parameters\n",
    "            np.random.normal(loc=0.1, scale=0.1, size=n_nodes),\n",
    "            np.random.normal(loc=0, scale=0.1, size=M * n_nodes ** 2),)),\n",
    "        np.hstack((  # beta=log(sigma), log of the variance of the parameters    \n",
    "            np.log(np.clip(np.random.normal(loc=1.0, scale=0.1, size=n_nodes), 1e-1, 2.0)),\n",
    "            np.log(np.clip(np.random.normal(loc=1.0, scale=0.1, size=M * n_nodes ** 2), 1e-1, 2.0)),))\n",
    "    )),\n",
    "    dtype=torch.float64, requires_grad=True\n",
    ")\n",
    "\n",
    "learner = VariationalInferenceLearner(model=model, optimizer=optim.Adam([x], lr=0.02), tol=1e-4, max_iter=15e3,\n",
    "                                      hyperparam_momentum=0.5, hyperparam_interval=100,hyperparam_offset=0)\n",
    "learner.fit(events_train, end_time_train, x, callback=callback)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test the model\n",
    "Cache the test data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initialize cache 20449/20449     \n"
     ]
    }
   ],
   "source": [
    "excitation_obj = excitation_kernels.MixtureGaussianFilter(M=M, end_time=T, cut_off=1500.0)\n",
    "model_likelihood_SG = hawkes_model.HawkesModel(excitation_obj,verbose=True)\n",
    "model_likelihood_SG.set_data(events_test, end_time=end_time_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Testing the model on test data using the mean of approximate posterior"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "M : 10 | T : 5 | log_likelihood (mean) : -0.19274145427525693\n"
     ]
    }
   ],
   "source": [
    "alpha, beta = learner.coeffs.data[:n_params], learner.coeffs.data[n_params:]\n",
    "z = learner.model.posterior.mean(alpha, beta)\n",
    "mu_vi = z[:n_nodes].cpu()\n",
    "adj_vi = z[n_nodes:].cpu().reshape(n_nodes, n_nodes, learner.model.model.excitation.M)\n",
    "test_likelihood = model_likelihood_SG.log_likelihood(mu_vi,adj_vi)\n",
    "\n",
    "print(f'M : {M} | T : {T} | log_likelihood (mean) : {test_likelihood/ num_test}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Testing the model on test data using the mode of approximate posterior"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "M : 10 | T : 5 | log_likelihood (mode): -0.37269375464954696\n"
     ]
    }
   ],
   "source": [
    "z = learner.model.posterior.mode(alpha, beta)\n",
    "mu_vi = z[:n_nodes].cpu()\n",
    "adj_vi = z[n_nodes:].cpu().reshape(n_nodes, n_nodes, learner.model.model.excitation.M)\n",
    "\n",
    "test_likelihood = model_likelihood_SG.log_likelihood(mu_vi,adj_vi)\n",
    "print(f'M : {M} | T : {T} | log_likelihood (mode): {test_likelihood/ num_test}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "----\n",
    "\n",
    "## Inference with VI-EXP"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set the hyper-parameters of the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "M = 1\n",
    "decay = 20"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Cache the testing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initialize cache 20449/20449     \n"
     ]
    }
   ],
   "source": [
    "excitation_obj = excitation_kernels.ExponentialKernel(decay=decay, cut_off=1500.0)\n",
    "model_likelihood = hawkes_model.HawkesModel(excitation_obj,verbose=True)\n",
    "model_likelihood.set_data(events_test, end_time=end_time_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "excitation_obj = excitation_kernels.ExponentialKernel(decay=decay, cut_off=1500.0)\n",
    "hawkes_model_obj = hawkes_model.HawkesModel(excitation_obj, verbose=True)\n",
    "posterior_obj = posteriors.LogNormalPosterior()\n",
    "n_params = n_nodes * (n_nodes * M + 1)\n",
    "C = 1.0 * torch.ones(n_params, dtype=torch.float64)\n",
    "prior_obj = priors.GaussianLaplacianPrior(n_nodes, n_params, C)\n",
    "\n",
    "model = models.ModelHawkesVariational(\n",
    "    model=hawkes_model_obj, posterior=posterior_obj, prior=prior_obj, n_samples=1, n_weights=1, weight_temp=1e0,\n",
    ")\n",
    "x = torch.tensor(\n",
    "    np.hstack((\n",
    "        np.hstack((  # alpha, the mean of the parameters\n",
    "            np.random.normal(loc=0.1, scale=0.1, size=n_nodes),\n",
    "            np.random.normal(loc=0, scale=0.1, size=M * n_nodes ** 2),)),\n",
    "        np.hstack((  # beta=log(sigma), log of the variance of the parameters    \n",
    "            np.log(np.clip(np.random.normal(loc=1.0, scale=0.1, size=n_nodes), 1e-1, 2.0)),\n",
    "            np.log(np.clip(np.random.normal(loc=1.0, scale=0.1, size=M * n_nodes ** 2), 1e-1, 2.0)),))\n",
    "    )),\n",
    "    dtype=torch.float64, requires_grad=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initialize cache 20449/20449     \n",
      "iter: 20000 | f1-score: 0.05 | relerr: 0.123 | p@20-50-100: 0.00 0.02 0.02 | dx: 4.58e-02    "
     ]
    },
    {
     "data": {
      "text/plain": [
       "tensor([-3.4036, -2.0673, -6.5648,  ..., -0.2367,  0.0326, -1.7203],\n",
       "       dtype=torch.float64, requires_grad=True)"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "learner = VariationalInferenceLearner(model=model, optimizer=optim.Adam([x], lr=0.02), tol=1e-4, max_iter=2e4,\n",
    "                                      hyperparam_momentum=0.5, hyperparam_interval=100,hyperparam_offset=0)\n",
    "learner.fit(events_train, end_time_train, x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Testing the model on test data using the mode of approximate posterior"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "alpha, beta = learner.coeffs.data[:n_params], learner.coeffs.data[n_params:]\n",
    "z = learner.model.posterior.mode(alpha, beta)\n",
    "mu_vi = z[:n_nodes].cpu()\n",
    "adj_vi = z[n_nodes:].cpu().reshape(n_nodes, n_nodes, learner.model.model.excitation.M)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test Likelihood of VI (mode of the posterior): -0.19476225280328086\n"
     ]
    }
   ],
   "source": [
    "test_likelihood = model_likelihood.log_likelihood(mu_vi,adj_vi)\n",
    "print(f'Test Likelihood of VI (mode of the posterior): {test_likelihood/num_test}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Testing the model on test data using the mean of approximate posterior"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "alpha, beta = learner.coeffs.data[:n_params], learner.coeffs.data[n_params:]\n",
    "z = learner.model.posterior.mean(alpha, beta)\n",
    "mu_vi = z[:n_nodes].cpu()\n",
    "adj_vi = z[n_nodes:].cpu().reshape(n_nodes, n_nodes, learner.model.model.excitation.M)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test Likelihood of VI (mean of the posterior): -0.14706493559708186\n"
     ]
    }
   ],
   "source": [
    "test_likelihood = model_likelihood.log_likelihood(mu_vi,adj_vi)\n",
    "print(f'Test Likelihood of VI (mean of the posterior): {test_likelihood/num_test}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
